<<<<<<< HEAD ======= >>>>>>> parent of b991ebd... proof read commit: Section 3: PCA plots

Section 3: PCA plots

<<<<<<< HEAD ======= >>>>>>> parent of b991ebd... proof read commit:

Arun Seetharam, Ha Vu

Tuteja Lab

2021-08-13

PCA plots

This section uses the counts data generated in Section 1 to perform PCA analyses and plot them.

Prerequisites

Packages required for this analyses are loaded as follows:

setwd("~/github/amnion.vs.other_RNASeq")
library(sva)
library(tidyverse)
library(DESeq2)
library(vsn)
library(pheatmap)
library(ggrepel)
library(RColorBrewer)
library(plotly)

Import datasets

The counts data and its associated metadata (coldata) are imported for analyses

counts = 'assets/new-counts.tsv'
groupFile = 'assets/new-batch.v2.tsv'
coldata <- read.csv(groupFile, row.names=1, sep="\t", stringsAsFactors = TRUE)
cts <- as.matrix(read.csv(counts,sep="\t",row.names="gene.ids"))

Inspect the coldata

DT::datatable(coldata)
<<<<<<< HEAD

reorder columns of cts according to coldata rows. Check if it both files matches.

=======

order and inspect if all the coldata matches the headers of the counts data.

>>>>>>> parent of b991ebd... proof read commit:
colnames(cts)
#>  [1] "BT_EVT_Okae.1"    "BT_SCT_Okae.1"    "BT_TSC_Okae.1"    "BT_EVT_Okae.2"   
#>  [5] "BT_SCT_Okae.2"    "BT_TSC_Okae.2"    "CT_EVT_Okae.1"    "CT_SCT_Okae.1"   
#>  [9] "CT_TSC_Okae.1"    "CT_EVT_Okae.2"    "CT_SCT_Okae.2"    "CT_TSC_Okae.2"   
#> [13] "CT_EVT_Okae.3"    "CT_SCT_Okae.3"    "CT_TSC_Okae.3"    "n_H9.1"          
#> [17] "n_H9.2"           "nTE_D1.1"         "nTE_D1.2"         "nTE_D2.1"        
#> [21] "nTE_D2.2"         "nTE_D3.1"         "nTE_D3.2"         "nCT_P3.1"        
#> [25] "nCT_P3.2"         "nCT_P10.1"        "nCT_P10.2"        "nCT_P15.1"       
#> [29] "nCT_P15.2"        "cR_nCT_P15.1"     "cR_nCT_P15.2"     "nCT_P15_iPSC.1"  
#> [33] "nCT_P15_iPSC.2"   "CT_Okae.1"        "CT_Okae.2"        "nST.1"           
#> [37] "nST.2"            "nEVT.1"           "nEVT.2"           "pH9.1"           
#> [41] "pH9.2"            "pBAP_D1.1"        "pBAP_D1.2"        "pBAP_D2.1"       
#> [45] "pBAP_D2.2"        "pBAP_D3.1"        "pBAP_D3.2"        "CT_7wk.1"        
#> [49] "CT_7wk.2"         "CT_9wk.1"         "CT_11wk.1"        "amnion_18w.1"    
#> [53] "amnion_9w.1"      "amnion_16w.1"     "amnion_16w.2"     "amnion_22w.1"    
#> [57] "amnion_9w.2"      "amnion_22w.2"     "H1_gt70_D8_BAP.1" "H1_gt70_D8_BAP.2"
#> [61] "H1_gt70_D8_BAP.3" "H1_lt40_D8_BAP.1" "H1_lt40_D8_BAP.2" "H1_lt40_D8_BAP.3"
#> [65] "H1_Yabe.1"        "H1_Yabe.2"        "H1_Yabe.3"        "amnion_Term.1"   
#> [69] "amnion_Term.2"    "amnion_Term.3"    "amnion_Term.4"    "amnion_Term.5"   
#> [73] "amnion_Term.6"    "amnion_Term.7"    "amnion_Term.8"    "amnion_Term.9"   
#> [77] "amnion_Term.10"   "amnion_Term.11"   "amnion_Term.12"   "H9_Shao.1"       
#> [81] "H9_Shao.2"        "H9_Shao.3"        "H9_amnion.1"      "H9_amnion.2"     
#> [85] "H9_amnion.3"
all(rownames(coldata) %in% colnames(cts))
#> [1] TRUE
cts <- cts[, rownames(coldata)]
cov1 <- as.factor(coldata$authors)
adjusted_counts <- ComBat_seq(cts, batch=cov1, group=NULL)
#> Found 6 batches
#> Using null model in ComBat-seq.
#> Adjusting for 0 covariate(s) or covariate level(s)
#> Estimating dispersions
#> Fitting the GLM model
#> Shrinkage off - using GLM estimates for parameters
#> Adjusting the data
all(rownames(coldata) %in% colnames(cts))
#> [1] TRUE
cts <- cts[, rownames(coldata)]
dds <- DESeqDataSetFromMatrix(countData = adjusted_counts,
                              colData = coldata,
                              design = ~ group)
<<<<<<< HEAD

transformation

vsd <- vst(dds, blind = FALSE)
pcaData <-
  plotPCA(vsd,
          intgroup = c("group", "authors"),
          returnData = TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))

PCA plot (all)

PCA plot for the dataset that includes all libraries.

ggplotly(
  ggplot(pcaData, aes(
    PC1, PC2, color = group.1, shape = authors
  )) +
    scale_shape_manual(values = 1:8) +
    theme_bw() +
    theme(legend.title = element_blank()) +
    geom_point(size = 1, stroke = 1) +
    xlab(paste0("PC1: ", percentVar[1], "% variance")) +
    ylab(paste0("PC2: ", percentVar[2], "% variance"))
)
=======
vsd <- vst(dds, blind=FALSE)
pcaData <- plotPCA(vsd, intgroup=c("group", "authors"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplotly(ggplot(pcaData, aes(PC1, PC2, color=group.1, shape=authors)) +
  scale_shape_manual(values=1:8) + 
  theme_bw() +
  theme(legend.title = element_blank(), legend.position = "none") +
  geom_point(size=1, stroke = 1) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")))
>>>>>>> parent of b991ebd... proof read commit:

FigX: PCA plots (all samples)

Plot differenciated cells only

setwd("~/github/amnion.vs.other_RNASeq")
<<<<<<< HEAD
counts2 = 'assets/counts-dotted.v3.tsv'
groupFile = 'assets/batch-dotted.v3.tsv'
coldata2 <-
  read.csv(
    groupFile,
    row.names = 1,
    sep = "\t",
    stringsAsFactors = TRUE
  )
cts2 <- as.matrix(read.csv(counts2, sep = "\t", row.names = "gene.ids"))

inspect the coldata

DT::datatable(coldata2)

reorder columns of cts according to coldata rows. Check if it both files matches.

all(rownames(coldata2) %in% colnames(cts2))
#> [1] TRUE
cts2 <- cts2[, rownames(coldata2)]

Batch correction

Using combat seq (SVA package) run batch correction - using bioproject ids as variable (dataset origin).

cov1 <- as.factor(coldata2$authors)
adjusted_counts <- ComBat_seq(cts2, batch = cov1, group = NULL)
#> Found 3 batches
#> Using null model in ComBat-seq.
#> Adjusting for 0 covariate(s) or covariate level(s)
#> Estimating dispersions
#> Fitting the GLM model
#> Shrinkage off - using GLM estimates for parameters
#> Adjusting the data
all(rownames(coldata2) %in% colnames(cts2))
#> [1] TRUE
cts2 <- cts2[, rownames(coldata2)]

run DESeq2

The batch corrected read counts are then used for running DESeq2 analyses

dds <- DESeqDataSetFromMatrix(countData = adjusted_counts,
                              colData = coldata2,
                              design = ~ group)

transformation

vsd <- vst(dds, blind = FALSE)
pcaData <-
  plotPCA(vsd,
          intgroup = c("group", "authors"),
          returnData = TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))

PCA plot (differenticated)

ggplotly(
  ggplot(pcaData, aes(
    PC1, PC2, color = group.1, shape = authors
  )) +
    scale_shape_manual(values = 1:8) +
    theme_bw() +
    theme(legend.title = element_blank()) +
    geom_point(size = 1, stroke = 1) +
    xlab(paste0("PC1: ", percentVar[1], "% variance")) +
    ylab(paste0("PC2: ", percentVar[2], "% variance"))
)
======= counts2 = 'assets/counts-dotted.tsv' groupFile = 'assets/batch-dotted.v2.tsv' coldata2 <- read.csv(groupFile, row.names=1, sep="\t", stringsAsFactors = TRUE) cts2 <- as.matrix(read.csv(counts2,sep="\t",row.names="gene.ids")) all(rownames(coldata2) %in% colnames(cts2)) #> [1] TRUE cts2 <- cts2[, rownames(coldata2)]

Inspect the coldata

DT::datatable(coldata2)
cov1 <- as.factor(coldata2$authors)
adjusted_counts <- ComBat_seq(cts2, batch=cov1, group=NULL)
#> Found 3 batches
#> Using null model in ComBat-seq.
#> Adjusting for 0 covariate(s) or covariate level(s)
#> Estimating dispersions
#> Fitting the GLM model
#> Shrinkage off - using GLM estimates for parameters
#> Adjusting the data
all(rownames(coldata2) %in% colnames(cts2))
#> [1] TRUE
cts2 <- cts2[, rownames(coldata2)]
dds <- DESeqDataSetFromMatrix(countData = adjusted_counts,
                              colData = coldata2,
                              design = ~ group)
vsd <- vst(dds, blind=FALSE)
pcaData <- plotPCA(vsd, intgroup=c("group", "authors"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplotly(ggplot(pcaData, aes(PC1, PC2, color=group.1, shape=authors)) +
  scale_shape_manual(values=1:8) + 
  theme_bw() +
  theme(legend.title = element_blank(), legend.position = "none") +
  geom_point(size=1, stroke = 1) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")))
>>>>>>> parent of b991ebd... proof read commit:

FigX: PCA plots (all samples)